# General packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# for scaling the data
from sklearn.preprocessing import StandardScaler
# for clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
# Set display options to show all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# Set the ggplot style
plt.style.use('ggplot')
# Load the dataset.
# BUG FIX: this was previously assigned to `Data`, but every subsequent cell
# references `cohort_users`, which would raise a NameError on first use.
cohort_users = pd.read_csv('Documents/Mastry_Project2/FinalSQL.csv')
# Summary statistics for every column except user_id — an identifier has no
# meaningful statistical summary.
stats_columns = cohort_users.columns.difference(['user_id'])
cohort_users[stats_columns].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| age | 5998.0 | 41.014672 | 12.046715 | 16.00 | 34.0000 | 41.000 | 48.0000 | 88.00 |
| avg_booking_departure_gap_days_flights | 5206.0 | 21.818417 | 48.355668 | 2.35 | 6.6100 | 7.645 | 9.0200 | 365.08 |
| avg_booking_departure_gap_days_hotels | 5435.0 | 12.981943 | 27.731505 | 1.50 | 6.5800 | 7.730 | 9.0500 | 329.83 |
| avg_checked_bags | 5206.0 | 0.603338 | 0.512678 | 0.00 | 0.2500 | 0.500 | 1.0000 | 6.60 |
| avg_distance_flown_km | 5206.0 | 2348.176104 | 1749.255961 | 27.52 | 1476.5125 | 2000.040 | 2696.1900 | 15954.76 |
| avg_flight_discount_amount | 4721.0 | 0.141010 | 0.070932 | 0.05 | 0.1000 | 0.130 | 0.1800 | 0.55 |
| avg_flight_price_usd | 5206.0 | 550.205344 | 739.590156 | 5.35 | 282.7750 | 392.795 | 565.9625 | 14280.38 |
| avg_flight_seats | 5206.0 | 1.222578 | 0.454272 | 1.00 | 1.0000 | 1.000 | 1.3300 | 6.60 |
| avg_hotel_discount_amount | 4063.0 | 0.112437 | 0.054025 | 0.05 | 0.0800 | 0.100 | 0.1500 | 0.45 |
| avg_hotel_price_usd | 5435.0 | 178.272981 | 84.673031 | 24.00 | 124.3300 | 163.000 | 211.8350 | 1063.00 |
| avg_hotel_rooms | 5435.0 | 1.198267 | 0.352250 | 1.00 | 1.0000 | 1.000 | 1.3300 | 4.00 |
| avg_page_clicks | 5998.0 | 18.782508 | 11.969869 | 4.13 | 12.5600 | 15.670 | 19.8150 | 163.90 |
| avg_session_duration_minute | 5998.0 | 1.780123 | 1.602640 | 0.00 | 1.1000 | 1.440 | 1.8900 | 16.40 |
| avg_stay_duration_day | 5435.0 | 3.804942 | 3.007981 | -1.00 | 2.0000 | 3.000 | 5.0000 | 30.00 |
| cancellation_proportion | 5566.0 | 0.047528 | 0.160093 | 0.00 | 0.0000 | 0.000 | 0.0000 | 1.00 |
| conversion_rate | 5998.0 | 0.333846 | 0.185690 | 0.00 | 0.2200 | 0.330 | 0.5000 | 1.00 |
| discounted_flight_proportion | 5998.0 | 0.187049 | 0.136935 | 0.00 | 0.1100 | 0.130 | 0.2500 | 0.70 |
| discounted_hotel_proportion | 5998.0 | 0.145782 | 0.126332 | 0.00 | 0.0000 | 0.130 | 0.2500 | 0.75 |
| round_trips_proportion | 5998.0 | 0.827904 | 0.351102 | 0.00 | 1.0000 | 1.000 | 1.0000 | 1.00 |
| session_count | 5998.0 | 8.427309 | 0.790536 | 8.00 | 8.0000 | 8.000 | 9.0000 | 14.00 |
| total_flights_booked | 5998.0 | 2.397966 | 1.577156 | 0.00 | 1.0000 | 2.000 | 3.0000 | 9.00 |
| total_hotels_booked | 5998.0 | 2.527509 | 1.511832 | 0.00 | 1.0000 | 2.000 | 4.0000 | 8.00 |
| total_trips | 5998.0 | 2.805435 | 1.589629 | 0.00 | 2.0000 | 3.000 | 4.0000 | 9.00 |
| weekend_trip_proportion | 5998.0 | 0.049298 | 0.137947 | 0.00 | 0.0000 | 0.000 | 0.0000 | 1.00 |
# Flag users based on bookings: True if the user has made at least one booking
# (cancellation_proportion is only populated for users with bookings).
cohort_users['has_booked'] = cohort_users['cancellation_proportion'].notna()
# Fill cancellation_proportion nulls with 0 (no bookings -> nothing cancelled).
# FIX: assign the result back instead of fillna(inplace=True) on a column
# slice, which is deprecated chained assignment in pandas.
cohort_users['cancellation_proportion'] = cohort_users['cancellation_proportion'].fillna(0)
# Position to insert the combined column: right after the two gap columns.
position = cohort_users.columns.get_loc('avg_booking_departure_gap_days_hotels') + 1
# New column = the maximum of the flight/hotel booking-departure gaps.
cohort_users.insert(position, 'avg_booking_departure_gap_days',
                    cohort_users[['avg_booking_departure_gap_days_flights',
                                  'avg_booking_departure_gap_days_hotels']].max(axis=1))
# Drop the originals now that the combined column exists.
cohort_users.drop(columns=['avg_booking_departure_gap_days_flights',
                           'avg_booking_departure_gap_days_hotels'], inplace=True)
# Report remaining nulls in the new column (users with no bookings at all).
null_values = cohort_users['avg_booking_departure_gap_days'].isna().sum()
print(f'We have {null_values} null values in avg_booking_departure_gap_days column.')
We have 432 null values in avg_booking_departure_gap_days column.
# Impute missing booking-departure gaps with the column maximum: users who
# never booked are treated as having the longest planning horizon.
# FIX: assign back instead of fillna(inplace=True) on a column slice
# (deprecated chained assignment in pandas).
max_gap = cohort_users['avg_booking_departure_gap_days'].max()
cohort_users['avg_booking_departure_gap_days'] = (
    cohort_users['avg_booking_departure_gap_days'].fillna(max_gap)
)
# Flag users who have booked at least one flight (flight metrics are null otherwise).
cohort_users['has_booked_flight'] = cohort_users['avg_flight_price_usd'].notna()
# Columns where "no flight" naturally means zero
zero_fill_columns = ['avg_flight_discount_amount', 'avg_flight_seats', 'avg_checked_bags']
# Columns imputed with the column median
median_fill_columns = ['avg_flight_price_usd', 'avg_distance_flown_km']
# FIX: assign back instead of fillna(inplace=True) on a column slice
# (deprecated chained assignment in pandas).
for col in zero_fill_columns:
    cohort_users[col] = cohort_users[col].fillna(0)
for col in median_fill_columns:
    cohort_users[col] = cohort_users[col].fillna(cohort_users[col].median())
# Flag users who have booked at least one hotel (hotel metrics are null otherwise).
cohort_users['has_booked_hotel'] = cohort_users['avg_hotel_price_usd'].notna()
# Columns where "no hotel" naturally means zero
zero_fill_columns = ['avg_hotel_discount_amount', 'avg_hotel_rooms', 'avg_stay_duration_day']
# Columns imputed with the column median
median_fill_columns = ['avg_hotel_price_usd']
# FIX: assign back instead of fillna(inplace=True) on a column slice
# (deprecated chained assignment in pandas).
for col in zero_fill_columns:
    cohort_users[col] = cohort_users[col].fillna(0)
for col in median_fill_columns:
    cohort_users[col] = cohort_users[col].fillna(cohort_users[col].median())
# Confirm the dataset is now free of nulls.
null_values = cohort_users.isna().sum().sum()
print(f'We have {null_values} Null values in the dataset.')
We have 0 Null values in the dataset.
# Count and report the (rare) records with a negative average stay duration.
neg_mask = cohort_users['avg_stay_duration_day'] < 0
negative_numbers = cohort_users.loc[neg_mask].shape[0]
percentage = round(negative_numbers / cohort_users['avg_stay_duration_day'].count() * 100, 2)
print(f'Only {negative_numbers} records ({percentage}%) of avg_stay_duration_day column have negative amounts.')
Only 2 records (0.03%) of avg_stay_duration_day column have negative amounts.
# Clip the invalid negative stay durations up to 0.
cohort_users.loc[cohort_users['avg_stay_duration_day'] < 0, 'avg_stay_duration_day'] = 0
# Verify nothing negative remains.
negative_numbers = cohort_users[cohort_users['avg_stay_duration_day'] < 0].shape[0]
print(f'Now {negative_numbers} records of the avg_stay_duration column is negative.')
Now 0 records of the avg_stay_duration column is negative.
# Numerical columns only, excluding the user_id identifier.
numerical_columns = [c for c in cohort_users.select_dtypes(include=['float64', 'int64']).columns
                     if c != 'user_id']
# One row per column: histogram on the left, box plot on the right.
fig, axes = plt.subplots(len(numerical_columns), 2, figsize=(15, 4 * len(numerical_columns)))
for i, col in enumerate(numerical_columns):
    series = cohort_users[col]
    # Outlier bound at mean + 5 std. The symmetric lower bound (mean - 5*std)
    # is omitted because it falls outside the data range for every column.
    mean = series.mean()
    upper_bound = mean + 5 * series.std()
    # Histogram, bin count via Sturges' formula.
    bins = int(np.log2(len(series)) + 1)
    sns.histplot(series, ax=axes[i, 0], bins=bins, kde=True, edgecolor='black')
    axes[i, 0].set_title(f'Histogram of {col}')
    axes[i, 0].axvline(upper_bound, color='r', linestyle='--', label='5*STD')
    axes[i, 0].axvline(mean, color='r', linestyle='--', label='Mean')
    axes[i, 0].set_xlabel(col)
    axes[i, 0].set_ylabel('Frequency')
    axes[i, 0].legend()
    # Box plot with the same 5*STD reference line.
    sns.boxplot(x=series, ax=axes[i, 1], showfliers=True, boxprops=dict(alpha=0.7))
    axes[i, 1].axvline(upper_bound, color='r', linestyle='--', label='5*STD')
    axes[i, 1].set_title(f'Boxplot of {col}')
    axes[i, 1].legend()
plt.tight_layout()
plt.show()
# The cohort originally had 5998 records; the difference is what was removed.
# NOTE(review): the row-dropping step that actually removed the outliers is not
# visible in this file — cohort_users.shape must have been reduced by an
# earlier cell. Confirm against the full notebook.
number_of_outliers = 5998 - cohort_users.shape[0]
percentage = round(number_of_outliers/5998 * 100,2)
print(f'{number_of_outliers} ({percentage}%) records were outliers!')
329 (5.49%) records were outliers!
# Derived metric: average dollars saved per kilometre flown
# (discount fraction * ticket price / distance), rounded to 2 decimals.
avg_usd_saved_per_km = (cohort_users['avg_flight_discount_amount']
                        * cohort_users['avg_flight_price_usd']
                        / cohort_users['avg_distance_flown_km']).round(2)
# Insert the new metric right after avg_distance_flown_km.
position = cohort_users.columns.get_loc('avg_distance_flown_km') + 1
cohort_users.insert(position, 'avg_usd_saved_per_km', avg_usd_saved_per_km)
# Feature columns to standardise before clustering.
cols_to_scale = ['married', 'has_children', 'age', 'session_count', 'avg_session_duration_minute', 'avg_page_clicks',
                 'total_trips', 'conversion_rate', 'weekend_trip_proportion', 'cancellation_proportion',
                 'avg_booking_departure_gap_days', 'total_flights_booked', 'round_trips_proportion',
                 'avg_flight_price_usd', 'avg_flight_discount_amount', 'discounted_flight_proportion', 'avg_flight_seats',
                 'avg_checked_bags', 'avg_distance_flown_km', 'avg_usd_saved_per_km', 'total_hotels_booked',
                 'avg_hotel_price_usd', 'avg_hotel_discount_amount', 'discounted_hotel_proportion',
                 'avg_hotel_rooms', 'avg_stay_duration_day']
# Standardise (zero mean, unit variance) into new "scaled_" columns rounded to
# two decimals, keeping the raw columns intact for later profiling.
scaler = StandardScaler()
scaled_columns = ['scaled_' + c for c in cols_to_scale]
cohort_users[scaled_columns] = scaler.fit_transform(cohort_users[cols_to_scale]).round(2)
# Inspect the result
cohort_users.head()
| user_id | sign_up_date | age | gender | married | has_children | home_country | home_city | session_count | avg_session_duration_minute | avg_page_clicks | total_trips | conversion_rate | weekend_trip_proportion | cancellation_proportion | avg_booking_departure_gap_days | total_flights_booked | round_trips_proportion | avg_flight_price_usd | avg_flight_discount_amount | discounted_flight_proportion | avg_flight_seats | avg_checked_bags | avg_distance_flown_km | avg_usd_saved_per_km | total_hotels_booked | avg_hotel_price_usd | avg_hotel_discount_amount | discounted_hotel_proportion | avg_hotel_rooms | avg_stay_duration_day | has_booked | has_booked_flight | has_booked_hotel | scaled_married | scaled_has_children | scaled_age | scaled_session_count | scaled_avg_session_duration_minute | scaled_avg_page_clicks | scaled_total_trips | scaled_conversion_rate | scaled_weekend_trip_proportion | scaled_cancellation_proportion | scaled_avg_booking_departure_gap_days | scaled_total_flights_booked | scaled_round_trips_proportion | scaled_avg_flight_price_usd | scaled_avg_flight_discount_amount | scaled_discounted_flight_proportion | scaled_avg_flight_seats | scaled_avg_checked_bags | scaled_avg_distance_flown_km | scaled_avg_usd_saved_per_km | scaled_total_hotels_booked | scaled_avg_hotel_price_usd | scaled_avg_hotel_discount_amount | scaled_discounted_hotel_proportion | scaled_avg_hotel_rooms | scaled_avg_stay_duration_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 23557 | 2021-07-22 | 64 | F | True | False | usa | new york | 12 | 6.93 | 17.50 | 4 | 0.33 | 0.00 | 0.25 | 248.05 | 2 | 1.0 | 518.29 | 0.15 | 0.21 | 1.00 | 1.00 | 2680.73 | 0.03 | 4 | 157.33 | 0.18 | 0.29 | 1.33 | 7.33 | True | True | True | 1.13 | -0.69 | 1.93 | 4.81 | 4.98 | -0.04 | 0.73 | -0.05 | -0.38 | 2.59 | 2.07 | -0.27 | 0.5 | 0.15 | 0.46 | 0.21 | -0.06 | 1.10 | 0.45 | 0.31 | 0.95 | -0.24 | 1.52 | 1.22 | 0.52 | 1.44 |
| 1 | 94883 | 2022-02-07 | 51 | F | True | False | usa | kansas city | 12 | 0.50 | 8.33 | 3 | 0.25 | 0.00 | 0.00 | 60.98 | 3 | 1.0 | 1784.95 | 0.10 | 0.08 | 1.67 | 1.00 | 5057.35 | 0.04 | 2 | 90.00 | 0.08 | 0.17 | 1.50 | 0.50 | True | True | True | 1.13 | -0.69 | 0.83 | 4.81 | -1.06 | -0.92 | 0.10 | -0.48 | -0.38 | -0.26 | 0.18 | 0.36 | 0.5 | 3.62 | -0.13 | -0.75 | 1.23 | 1.10 | 2.51 | 0.79 | -0.37 | -1.16 | 0.06 | 0.24 | 0.89 | -1.03 |
| 3 | 101961 | 2022-02-17 | 43 | F | True | False | usa | boston | 12 | 2.00 | 18.17 | 7 | 0.58 | 0.14 | 0.00 | 7.34 | 6 | 1.0 | 321.53 | 0.13 | 0.25 | 1.00 | 0.50 | 1753.36 | 0.02 | 7 | 150.29 | 0.10 | 0.08 | 1.00 | 3.14 | True | True | True | 1.13 | -0.69 | 0.15 | 4.81 | 0.35 | 0.03 | 2.61 | 1.29 | 0.82 | -0.26 | -0.36 | 2.26 | 0.5 | -0.39 | 0.22 | 0.51 | -0.06 | -0.00 | -0.35 | -0.17 | 2.94 | -0.34 | 0.35 | -0.49 | -0.19 | -0.08 |
| 9 | 149058 | 2022-04-14 | 50 | F | False | True | usa | birmingham | 12 | 2.79 | 15.36 | 6 | 0.50 | 0.00 | 0.17 | 58.79 | 6 | 1.0 | 664.40 | 0.05 | 0.21 | 1.56 | 0.67 | 2304.81 | 0.01 | 6 | 165.67 | 0.00 | 0.14 | 1.11 | 8.56 | True | True | True | -0.89 | 1.45 | 0.75 | 4.81 | 1.09 | -0.24 | 1.99 | 0.86 | -0.38 | 1.68 | 0.16 | 2.26 | 0.5 | 0.55 | -0.72 | 0.21 | 1.01 | 0.37 | 0.13 | -0.65 | 2.28 | -0.13 | -1.11 | 0.00 | 0.05 | 1.88 |
| 10 | 152583 | 2022-04-17 | 37 | F | False | False | usa | colorado springs | 10 | 6.17 | 21.25 | 2 | 0.20 | 0.00 | 0.50 | 181.16 | 2 | 1.0 | 3769.43 | 0.15 | 0.25 | 1.80 | 1.20 | 10253.52 | 0.06 | 2 | 213.00 | 0.15 | 0.42 | 1.00 | 2.00 | True | True | True | -0.89 | -0.69 | -0.35 | 2.13 | 4.27 | 0.33 | -0.53 | -0.75 | -0.38 | 5.43 | 1.40 | -0.27 | 0.5 | 9.06 | 0.46 | 0.51 | 1.47 | 1.54 | 6.99 | 1.76 | -0.37 | 0.52 | 1.08 | 2.27 | -0.19 | -0.49 |
# Candidate clustering features, grouped by the perk each one is meant to signal.
feature_cols = [
    # Free hotel meal:
    'scaled_has_children', 'scaled_age', 'scaled_avg_flight_seats', 'scaled_avg_hotel_rooms',
    # Free checked bag:
    'scaled_avg_checked_bags', 'scaled_avg_stay_duration_day',
    'scaled_avg_distance_flown_km', 'scaled_round_trips_proportion',
    # No cancellation fees:
    'scaled_cancellation_proportion', 'scaled_avg_booking_departure_gap_days',
    # Exclusive discounts:
    'scaled_conversion_rate', 'scaled_avg_page_clicks', 'scaled_avg_session_duration_minute',
    'scaled_avg_flight_discount_amount', 'scaled_discounted_flight_proportion',
    'scaled_avg_usd_saved_per_km', 'scaled_avg_hotel_discount_amount',
    'scaled_discounted_hotel_proportion',
    # 1-night free hotel with a flight:
    'scaled_total_flights_booked', 'scaled_total_hotels_booked',
    'scaled_weekend_trip_proportion', 'scaled_total_trips',
]
features = cohort_users[feature_cols]
# Heatmap of pairwise correlations between the candidate features, used to
# prune redundant columns before clustering.
correlation_matrix = features.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1,
            linewidths=0.5, annot_kws={"size": 8})
# Rotate the x labels so long feature names stay readable.
plt.xticks(rotation=90, ha='right')
plt.title("Features Correlation Matrix Heatmap")
plt.tight_layout()
plt.show()
# Reduced feature set for clustering: one representative, weakly-correlated
# column per perk, kept after inspecting the correlation heatmap above.
features = cohort_users[[
    'scaled_avg_hotel_rooms',           # Free hotel meal
    'scaled_avg_checked_bags',          # Free checked bag
    'scaled_cancellation_proportion',   # No cancellation fees
    'scaled_conversion_rate',           # Exclusive discounts
    'scaled_weekend_trip_proportion',   # 1-night free hotel with a flight
]]
# Display the first few rows of the selected features
features.head()
| scaled_avg_hotel_rooms | scaled_avg_checked_bags | scaled_cancellation_proportion | scaled_conversion_rate | scaled_weekend_trip_proportion | |
|---|---|---|---|---|---|
| 0 | 0.52 | 1.10 | 2.59 | -0.05 | -0.38 |
| 1 | 0.89 | 1.10 | -0.26 | -0.48 | -0.38 |
| 3 | -0.19 | -0.00 | -0.26 | 1.29 | 0.82 |
| 9 | 0.05 | 0.37 | 1.68 | 0.86 | -0.38 |
| 10 | -0.19 | 1.54 | 5.43 | -0.75 | -0.38 |
def get_silhouette_score(features, n_clusters):
    """Fit KMeans with ``n_clusters`` on a shuffled copy of ``features``,
    print the silhouette score and return the fitted model.

    FIX: the function previously returned None, keeping the fitted KMeans
    trapped in its local scope — the later `model.labels_` cell would raise
    a NameError. Returning the model lets callers keep the fit they want.
    """
    # frac=1 keeps every row; sample() is used purely to shuffle the data.
    data_sample = features.sample(frac=1, random_state=1)
    # n_init pinned explicitly: its default changed across scikit-learn versions.
    model = KMeans(n_clusters=n_clusters, random_state=1, n_init=10)
    model.fit(data_sample)
    model_sil_score = silhouette_score(data_sample, model.labels_)
    print(f"The silhouette score using {n_clusters} number of clusters is: {round(model_sil_score, 3)}")
    return model

# Compare cluster counts from 3 to 7.
get_silhouette_score(features, 3)
get_silhouette_score(features, 4)
get_silhouette_score(features, 5)
get_silhouette_score(features, 6)
get_silhouette_score(features, 7)
The silhouette score using 3 number of clusters is: 0.314 The silhouette score using 4 number of clusters is: 0.367 The silhouette score using 5 number of clusters is: 0.351 The silhouette score using 6 number of clusters is: 0.378 The silhouette score using 7 number of clusters is: 0.394
# BUG FIX: `model` was never defined at this scope — the KMeans instance in
# get_silhouette_score is local to that function. Refit here with k=5, the
# cluster count the perk mapping below (labels 0-4) is built for, fitting on
# `features` directly (unshuffled) so labels_ aligns row-for-row with cohort_users.
model = KMeans(n_clusters=5, random_state=1, n_init=10).fit(features)
# Attach each user's cluster label to the main DataFrame.
cohort_users['cluster_label'] = model.labels_
model.labels_
array([2, 0, 3, ..., 4, 4, 1], dtype=int32)
# Preview the DataFrame with the new cluster_label column attached.
cohort_users.head()
| user_id | sign_up_date | age | gender | married | has_children | home_country | home_city | session_count | avg_session_duration_minute | avg_page_clicks | total_trips | conversion_rate | weekend_trip_proportion | cancellation_proportion | avg_booking_departure_gap_days | total_flights_booked | round_trips_proportion | avg_flight_price_usd | avg_flight_discount_amount | discounted_flight_proportion | avg_flight_seats | avg_checked_bags | avg_distance_flown_km | avg_usd_saved_per_km | total_hotels_booked | avg_hotel_price_usd | avg_hotel_discount_amount | discounted_hotel_proportion | avg_hotel_rooms | avg_stay_duration_day | has_booked | has_booked_flight | has_booked_hotel | scaled_married | scaled_has_children | scaled_age | scaled_session_count | scaled_avg_session_duration_minute | scaled_avg_page_clicks | scaled_total_trips | scaled_conversion_rate | scaled_weekend_trip_proportion | scaled_cancellation_proportion | scaled_avg_booking_departure_gap_days | scaled_total_flights_booked | scaled_round_trips_proportion | scaled_avg_flight_price_usd | scaled_avg_flight_discount_amount | scaled_discounted_flight_proportion | scaled_avg_flight_seats | scaled_avg_checked_bags | scaled_avg_distance_flown_km | scaled_avg_usd_saved_per_km | scaled_total_hotels_booked | scaled_avg_hotel_price_usd | scaled_avg_hotel_discount_amount | scaled_discounted_hotel_proportion | scaled_avg_hotel_rooms | scaled_avg_stay_duration_day | cluster_label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 23557 | 2021-07-22 | 64 | F | True | False | usa | new york | 12 | 6.93 | 17.50 | 4 | 0.33 | 0.00 | 0.25 | 248.05 | 2 | 1.0 | 518.29 | 0.15 | 0.21 | 1.00 | 1.00 | 2680.73 | 0.03 | 4 | 157.33 | 0.18 | 0.29 | 1.33 | 7.33 | True | True | True | 1.13 | -0.69 | 1.93 | 4.81 | 4.98 | -0.04 | 0.73 | -0.05 | -0.38 | 2.59 | 2.07 | -0.27 | 0.5 | 0.15 | 0.46 | 0.21 | -0.06 | 1.10 | 0.45 | 0.31 | 0.95 | -0.24 | 1.52 | 1.22 | 0.52 | 1.44 | 2 |
| 1 | 94883 | 2022-02-07 | 51 | F | True | False | usa | kansas city | 12 | 0.50 | 8.33 | 3 | 0.25 | 0.00 | 0.00 | 60.98 | 3 | 1.0 | 1784.95 | 0.10 | 0.08 | 1.67 | 1.00 | 5057.35 | 0.04 | 2 | 90.00 | 0.08 | 0.17 | 1.50 | 0.50 | True | True | True | 1.13 | -0.69 | 0.83 | 4.81 | -1.06 | -0.92 | 0.10 | -0.48 | -0.38 | -0.26 | 0.18 | 0.36 | 0.5 | 3.62 | -0.13 | -0.75 | 1.23 | 1.10 | 2.51 | 0.79 | -0.37 | -1.16 | 0.06 | 0.24 | 0.89 | -1.03 | 0 |
| 3 | 101961 | 2022-02-17 | 43 | F | True | False | usa | boston | 12 | 2.00 | 18.17 | 7 | 0.58 | 0.14 | 0.00 | 7.34 | 6 | 1.0 | 321.53 | 0.13 | 0.25 | 1.00 | 0.50 | 1753.36 | 0.02 | 7 | 150.29 | 0.10 | 0.08 | 1.00 | 3.14 | True | True | True | 1.13 | -0.69 | 0.15 | 4.81 | 0.35 | 0.03 | 2.61 | 1.29 | 0.82 | -0.26 | -0.36 | 2.26 | 0.5 | -0.39 | 0.22 | 0.51 | -0.06 | -0.00 | -0.35 | -0.17 | 2.94 | -0.34 | 0.35 | -0.49 | -0.19 | -0.08 | 3 |
| 9 | 149058 | 2022-04-14 | 50 | F | False | True | usa | birmingham | 12 | 2.79 | 15.36 | 6 | 0.50 | 0.00 | 0.17 | 58.79 | 6 | 1.0 | 664.40 | 0.05 | 0.21 | 1.56 | 0.67 | 2304.81 | 0.01 | 6 | 165.67 | 0.00 | 0.14 | 1.11 | 8.56 | True | True | True | -0.89 | 1.45 | 0.75 | 4.81 | 1.09 | -0.24 | 1.99 | 0.86 | -0.38 | 1.68 | 0.16 | 2.26 | 0.5 | 0.55 | -0.72 | 0.21 | 1.01 | 0.37 | 0.13 | -0.65 | 2.28 | -0.13 | -1.11 | 0.00 | 0.05 | 1.88 | 2 |
| 10 | 152583 | 2022-04-17 | 37 | F | False | False | usa | colorado springs | 10 | 6.17 | 21.25 | 2 | 0.20 | 0.00 | 0.50 | 181.16 | 2 | 1.0 | 3769.43 | 0.15 | 0.25 | 1.80 | 1.20 | 10253.52 | 0.06 | 2 | 213.00 | 0.15 | 0.42 | 1.00 | 2.00 | True | True | True | -0.89 | -0.69 | -0.35 | 2.13 | 4.27 | 0.33 | -0.53 | -0.75 | -0.38 | 5.43 | 1.40 | -0.27 | 0.5 | 9.06 | 0.46 | 0.51 | 1.47 | 1.54 | 6.99 | 1.76 | -0.37 | 0.52 | 1.08 | 2.27 | -0.19 | -0.49 | 2 |
# Centroid coordinates: one row per cluster, one column per clustering feature.
model_centroids = pd.DataFrame(model.cluster_centers_, columns=features.columns)
# Show the raw centroid table.
display(model_centroids)
# Heatmap of centroid positions: features on the y-axis, clusters on the x-axis.
plt.figure(figsize=(10, 6))
sns.heatmap(model_centroids.T, cmap="Blues", annot=True)
plt.xlabel("Cluster centroids", fontsize=14)
plt.tight_layout()
plt.title("Centroid positions in high-dimensional space", fontsize=15)
| scaled_avg_hotel_rooms | scaled_avg_checked_bags | scaled_cancellation_proportion | scaled_conversion_rate | scaled_weekend_trip_proportion | |
|---|---|---|---|---|---|
| 0 | 0.203140 | 0.718430 | -0.252331 | 0.398117 | -0.373519 |
| 1 | -2.340000 | -0.966199 | -0.260000 | -1.723496 | -0.380000 |
| 2 | 0.171943 | 0.342979 | 3.462332 | 0.321010 | 0.004534 |
| 3 | 0.215061 | 0.108494 | -0.238060 | 0.628589 | 2.301248 |
| 4 | 0.258524 | -0.790872 | -0.260000 | -0.372912 | -0.380000 |
Text(0.5, 1.0, 'Centroid positions in high-dimensional space')
# Human-readable perk name for each cluster label, chosen from the centroid profile.
cluster_dict = {
    0: 'Free Checked Bag',
    1: 'Exclusive Discounts',
    2: 'No Cancellation Fee',
    3: '1-night free hotel with a flight',
    4: 'Free Hotel Meal',
}
# Store the perk name alongside the numeric label.
cohort_users['perk'] = cohort_users['cluster_label'].map(cluster_dict)
# Users per segment
segment_counts = cohort_users['perk'].value_counts().reset_index()
segment_counts.columns = ['perk', 'user_count']
segment_counts
| perk | user_count | |
|---|---|---|
| 0 | Free Checked Bag | 2299 |
| 1 | Free Hotel Meal | 1755 |
| 2 | 1-night free hotel with a flight | 737 |
| 3 | Exclusive Discounts | 492 |
| 4 | No Cancellation Fee | 386 |
# Mean of each clustering feature within each perk segment.
grouped_df = cohort_users[['perk']].join(features).groupby('perk').mean()
grouped_df
| scaled_avg_hotel_rooms | scaled_avg_checked_bags | scaled_cancellation_proportion | scaled_conversion_rate | scaled_weekend_trip_proportion | |
|---|---|---|---|---|---|
| perk | |||||
| 1-night free hotel with a flight | 0.215061 | 0.108494 | -0.238060 | 0.628589 | 2.301248 |
| Exclusive Discounts | -2.340000 | -0.966199 | -0.260000 | -1.723496 | -0.380000 |
| Free Checked Bag | 0.203140 | 0.718430 | -0.252331 | 0.398117 | -0.373519 |
| Free Hotel Meal | 0.258524 | -0.790872 | -0.260000 | -0.372912 | -0.380000 |
| No Cancellation Fee | 0.171943 | 0.342979 | 3.462332 | 0.321010 | 0.004534 |
# KDE of each feature within each perk segment. cancellation_proportion is
# excluded: it is near-constant for most segments, which breaks the density plot.
filtered_columns = [col for col in grouped_df.columns if col != 'scaled_cancellation_proportion']
# Unique segment labels from the 'perk' column
segments = cohort_users['perk'].unique()
n_cols = 2  # Two plots per row
# BUG FIX: one row per *pair* of features. The previous len(filtered_columns)
# rows allocated twice the needed subplots — the bottom half was always deleted
# and the figure came out twice as tall as necessary.
n_rows = (len(filtered_columns) + n_cols - 1) // n_cols
# squeeze=False keeps axes 2-D even when only one row is needed.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
for i, feature in enumerate(filtered_columns):
    row, col = divmod(i, n_cols)
    # Overlay one KDE per segment on the same axes.
    for segment in segments:
        sns.kdeplot(
            cohort_users.loc[cohort_users['perk'] == segment, feature],
            label=segment,
            ax=axes[row, col]
        )
    axes[row, col].set_title(f'Distribution of {feature} Within Each Segment')
    axes[row, col].set_xlabel(feature)
    axes[row, col].set_ylabel('Density')
    axes[row, col].legend(title='Segment')
# Remove any unused trailing subplot (odd feature count).
for i in range(len(filtered_columns), n_rows * n_cols):
    row, col = divmod(i, n_cols)
    fig.delaxes(axes[row, col])
plt.tight_layout()
plt.show()
# One box plot per clustering feature, split by perk segment (one per row).
feature_names = list(features.columns)
fig, axes = plt.subplots(len(feature_names), 1, figsize=(12, 4 * len(feature_names)))
for ax, feature in zip(axes, feature_names):
    sns.boxplot(y='perk', x=feature, data=cohort_users, ax=ax)
    ax.set_title(f'Box Plot of {feature} Across Each Perk')
    ax.set_xlabel('Feature Values')
    ax.set_ylabel('Perk')
plt.tight_layout()
plt.show()
from sklearn.metrics.pairwise import euclidean_distances
# Work on a copy so the feature frame itself is left untouched.
features_copy = features.copy()
# Carry the cluster label and perk name along for grouping.
features_copy['cluster_label'] = cohort_users['cluster_label']
features_copy['perk'] = cohort_users['perk']
# FIX: initialise with NaN (not None) so the column keeps a float dtype
# instead of object — the groupby-mean below then stays numeric.
features_copy['dist_center'] = np.nan
# Distance of every point to its own cluster's centroid.
for cluster in model_centroids.index:
    in_cluster = features_copy['cluster_label'] == cluster
    cluster_sample = features_copy.loc[in_cluster]
    features_copy.loc[in_cluster, 'dist_center'] = euclidean_distances(
        cluster_sample.iloc[:, :-3],  # exclude cluster_label, perk, dist_center
        model_centroids.loc[cluster, :].values.reshape(1, -1)
    )
# Average distance to the centroid per perk — a rough cluster-cohesion measure.
avg_dist_to_centroid = features_copy.groupby('perk', as_index=False)['dist_center'].mean()
avg_dist_to_centroid
| perk | dist_center | |
|---|---|---|
| 0 | 1-night free hotel with a flight | 1.500694 |
| 1 | Exclusive Discounts | 0.331363 |
| 2 | Free Checked Bag | 1.209937 |
| 3 | Free Hotel Meal | 1.024512 |
| 4 | No Cancellation Fee | 2.027350 |
# Draw representative examples from a cluster (perk category).
def draw_examples(df, perk, max_dist, n, random_state=None):
    """Return ``n`` randomly-sampled rows of cluster ``perk`` whose distance
    to the cluster centroid is below ``max_dist``.

    ``random_state`` is a new optional parameter making the draw reproducible;
    the default of None preserves the original non-deterministic sampling.
    """
    print(f"Examples from cluster {perk} with less than {max_dist} distance to the cluster centroid")
    mask = (df['perk'] == perk) & (df['dist_center'] < max_dist)
    return df.loc[mask].sample(n, random_state=random_state)
# Draw 5 representative examples from the 'Exclusive Discounts' cluster
# (the original comment wrongly said 'No Cancellation Fee').
# 0.4 is a dataset-specific distance cutoff chosen from avg_dist_to_centroid above.
draw_examples(features_copy, perk='Exclusive Discounts', max_dist=0.4, n=5)
Examples from cluster Exclusive Discounts with less than 0.4 distance to the cluster centroid
| scaled_avg_hotel_rooms | scaled_avg_checked_bags | scaled_cancellation_proportion | scaled_conversion_rate | scaled_weekend_trip_proportion | cluster_label | perk | dist_center | |
|---|---|---|---|---|---|---|---|---|
| 5494 | -2.34 | -1.11 | -0.26 | -1.82 | -0.38 | 1 | Exclusive Discounts | 0.173181 |
| 1407 | -2.34 | -1.11 | -0.26 | -1.82 | -0.38 | 1 | Exclusive Discounts | 0.173181 |
| 5771 | -2.34 | -1.11 | -0.26 | -1.82 | -0.38 | 1 | Exclusive Discounts | 0.173181 |
| 1224 | -2.34 | -1.11 | -0.26 | -1.82 | -0.38 | 1 | Exclusive Discounts | 0.173181 |
| 3225 | -2.34 | -1.11 | -0.26 | -1.82 | -0.38 | 1 | Exclusive Discounts | 0.173181 |
# Per-perk means of each clustering feature (the same numbers as grouped_df,
# presented with friendlier column names for reporting).
metric_cols = ['scaled_conversion_rate', 'scaled_avg_hotel_rooms', 'scaled_avg_checked_bags',
               'scaled_cancellation_proportion', 'scaled_weekend_trip_proportion']
cluster_metrics = features_copy.groupby('perk', as_index=False)[metric_cols].mean()
# Rename columns for clarity
cluster_metrics.columns = ['Perk', 'Avg Conversion Rate', 'Avg Hotel Rooms', 'Avg Checked Bags',
                           'Avg Cancellation Proportion', 'Avg Weekend Trip Proportion']
cluster_metrics
| Perk | Avg Conversion Rate | Avg Hotel Rooms | Avg Checked Bags | Avg Cancellation Proportion | Avg Weekend Trip Proportion | |
|---|---|---|---|---|---|---|
| 0 | 1-night free hotel with a flight | 0.628589 | 0.215061 | 0.108494 | -0.238060 | 2.301248 |
| 1 | Exclusive Discounts | -1.723496 | -2.340000 | -0.966199 | -0.260000 | -0.380000 |
| 2 | Free Checked Bag | 0.398117 | 0.203140 | 0.718430 | -0.252331 | -0.373519 |
| 3 | Free Hotel Meal | -0.372912 | 0.258524 | -0.790872 | -0.260000 | -0.380000 |
| 4 | No Cancellation Fee | 0.321010 | 0.171943 | 0.342979 | 3.462332 | 0.004534 |
# Bar chart: number of customers per segment.
segment_count = cohort_users['perk'].value_counts().reset_index()
segment_count.columns = ['Segment', 'Number of Customers']
plt.figure(figsize=(12, 6))
sns.barplot(x='Number of Customers', y='Segment', data=segment_count, palette='cool')
plt.title('Distribution of Customers Across Different Segments')
plt.xlabel('Number of Customers')
plt.ylabel('Segment')
# Write each segment's count at the end of its bar.
for idx, count in enumerate(segment_count['Number of Customers']):
    plt.text(count, idx, str(count), ha='center', va='center', fontsize=10, color='black')
plt.show()
# Pie-slice label formatter, reused in the presentation file.
def func(pct, allvalues):
    """Format a slice label as its percentage plus the absolute user count."""
    total = np.sum(allvalues)
    absolute = int(pct / 100. * total)
    return "{:.1f}%\n({:d} users)".format(pct, absolute)
# Donut chart of segment sizes.
sizes = segment_count['Number of Customers']
labels = segment_count['Segment']
colors = sns.color_palette('cool', len(labels))
plt.figure(figsize=(7, 7))
plt.pie(sizes, labels=labels, colors=colors,
        autopct=lambda pct: func(pct, sizes), startangle=140, pctdistance=0.85)
# A white circle drawn over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# y nudges the title clear of the slice labels.
plt.title('Distribution of Customers Across Different Segments', y=1.08)
# Equal aspect ratio keeps the pie circular.
plt.axis('equal')
plt.tight_layout()
plt.show()
# Average customer age per segment.
avg_age_segment = cohort_users.groupby('perk')['age'].mean().reset_index()
avg_age_segment.columns = ['Segment', 'Average Age']
plt.figure(figsize=(12, 6))
ax = sns.barplot(x='Average Age', y='Segment', data=avg_age_segment, palette='cool')
# Annotate each bar with its average age, nudged left to sit inside the bar.
for idx, age in enumerate(avg_age_segment['Average Age']):
    ax.text(age - 0.5, idx, str(round(age, 2)))
plt.title('Average Age of Customers Across Different Segments')
plt.xlabel('Average Age')
plt.ylabel('Segment')
plt.show()
# Gender split within each perk segment.
plt.figure(figsize=(12, 6))
sns.countplot(data=cohort_users, x='perk', hue='gender', palette="cool")
plt.title('Gender Distribution by Perk')
plt.xlabel('Perk')
plt.ylabel('Number of Customers')
# Slight rotation keeps the long perk names from overlapping.
plt.xticks(rotation=15)
plt.show()
# Perk preference across the five cities with the most customers.
top_5_cities = cohort_users['home_city'].value_counts().nlargest(5).index.tolist()
top_5_cities_data = cohort_users[cohort_users['home_city'].isin(top_5_cities)]
plt.figure(figsize=(15, 8))
sns.countplot(data=top_5_cities_data, y='home_city', hue='perk', palette="cool")
plt.title('Top 5 Cities: Distribution of Customers by Preferred Perk')
plt.xlabel('Number of Customers')
plt.ylabel('City')
# Legend outside the axes so it does not cover the bars.
plt.legend(title='Preferred Perk', bbox_to_anchor=(1, 1), loc='upper left')
plt.tight_layout()
plt.show()
# Persist the final user -> perk assignment for downstream use.
final_segmentation_df = cohort_users[['user_id', 'perk']]
final_segmentation_df.to_csv('TravelTide_Customer_Segmentation_Kmean.csv', index=False)
# Preview the saved mapping.
final_segmentation_df.head()
| user_id | perk | |
|---|---|---|
| 0 | 23557 | No Cancellation Fee |
| 1 | 94883 | Free Checked Bag |
| 3 | 101961 | 1-night free hotel with a flight |
| 9 | 149058 | No Cancellation Fee |
| 10 | 152583 | No Cancellation Fee |